pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
library(ggplot2)
library(gridExtra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
p1 <- ggplot(aes(x = gender, y = age),
data = subset(pf, !is.na(gender))) + geom_boxplot() +
stat_summary(fun.y = mean, geom = "point", shape = 4)
p2 <- ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = "summary", fun.y = median)
grid.arrange(p1, p2, ncol = 1)
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n= n()) %>%
arrange(age)
head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
##
## # A tibble: 6 x 5
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <dbl> <dbl> <int>
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
## 5 15 female 538.6813 276.0 1139
## 6 15 male 200.6658 106.5 1478
ggplot(aes(x = age, y = median_friend_count),
data = pf.fc_by_age_gender) +
geom_line(aes(color = gender))
Notes: Use of reshape2 or tidyr (easier)
pf.fc_by_age_gender.wide = pf.fc_by_age_gender %>%
select(age, gender, median_friend_count) %>%
filter(!is.na(gender)) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = female / male)
head(pf.fc_by_age_gender.wide)
## Source: local data frame [6 x 4]
## Groups: age [6]
##
## # A tibble: 6 x 4
## age female male ratio
## <int> <dbl> <dbl> <dbl>
## 1 13 148.0 55.0 2.690909
## 2 14 224.0 92.5 2.421622
## 3 15 276.0 106.5 2.591549
## 4 16 258.5 136.0 1.900735
## 5 17 245.5 125.0 1.964000
## 6 18 243.0 122.0 1.991803
ggplot(aes(x = age, y = ratio), data = pf.fc_by_age_gender.wide) +
geom_line() +
geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
pf$year_joined = floor(2014 - pf$tenure / 365)
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
pf$year_joined.bucket <- cut(pf$year_joined,
c(2004, 2009, 2011, 2012, 2014))
table(pf$year_joined.bucket, useNA = "ifany")
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] <NA>
## 6669 15308 33366 43658 2
Notes: Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age.
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = "summary", fun.y = median)
Notes: Plotting the grand mean is a good reminder that much of the data in the sample is about members of recent cohorts.
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket))) +
geom_line(aes(color = year_joined.bucket),
stat = "summary", fun.y = mean) +
geom_line(stat = "summary", fun.y = mean, linetype = 2)
Notes: Max 417 is definetely an outlier since 3rd quantile is only about 0.57.
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
Notes: Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.
What is the median friend rate? 0.2205
What is the maximum friend rate? 417
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary", fun.y = mean)
Notes: Users with more tenure typically initiate less friendships.
Notes:
p3 = ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
p4 = ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p5 = ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p6 = ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p7 = ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_smooth(aes(color = year_joined.bucket))
grid.arrange(p3, p4, p5, p6, p7, ncol = 1)
## `geom_smooth()` using method = 'gam'
Notes: Bias Variance Trade Off
yo = read.csv("yogurt.csv")
# Change the id from an int to a factor
yo$id = factor(yo$id)
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
p8 = ggplot(aes(price), data = yo) +
geom_histogram(fill = "orange")
p9 = ggplot(aes(price), data = yo) +
geom_histogram(fill = "orange", binwidth = 10)
grid.arrange(p8, p9, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We notice * Important discreetness * For this very discreet data, this histogram is a very biased model.
Notes: See discreetness with summary or by length and unique. The 3rd quantile is about the same with the maximum.
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
unique(yo$price)
## [1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36
## [12] 55.04 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52
length(unique(yo$price))
## [1] 20
table(yo$price)
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
yo = transform(yo, all.purchases = strawberry + blueberry +
pina.colada + plain + mixed.berry)
summary(yo$all.purchases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
Notes: The modal or most common prices seem to be increasing over time. We also see some lower prices scattered about the graph. These may be due to sales or buyers using coupons that bring the prices down.
ggplot(aes(all.purchases), data = yo) +
geom_histogram(fill = "cyan", binwidth = 1)
ggplot(aes(time, price), data = yo) +
geom_jitter(alpha = 1/4, shape = 21, fill = "brown")
Notes: Take a closer look applying sub-sampling.
Notes: x %in% y returns a logical (boolean) vector the same length as x that says whether each entry in x appears in y. That is, for each entry in x, it checks to see whether it is in y. Use the pch or shape parameter to specify the symbol when plotting points. Using the size parameter, we add more detail to the plot.
# Set the seed for reproducible results
set.seed(4230)
sample.ids = sample(levels(yo$id), 16)
ggplot(aes(time, price), data = subset(yo, id %in% sample.ids)) +
facet_wrap(~ id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1)
Notes: The general idea is that we have observations over time, we can face by the primary unit, case or individual in the data set. For yogurt data, it was the households we were faceting over. The same plot cannot be generated with the pseudo Facebook data set, since we don’t have data on our sample of users over time.
Notes: If the plot takes a long time to render or if you want to see some of the scatterplot matrix, then only examine a smaller number of variables. You can use the following code or select fewer variables. We recommend including gender (the 6th variable)!
pf_subset <- pf[ , c(2:7)]
You may also find that variable labels are on the outer edges of the scatterplot matrix, rather than on the diagonal. If you want labels in the diagonal, you can set the axisLabels = ‘internal’ argument in your ggpairs command.
# install.packages('GGally')
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
theme_set(theme_minimal(10))
# Set the seed for reproducible results
set.seed(1836)
pf_subset = pf[, c(2:15)]
names(pf_subset)
## [1] "age" "dob_day"
## [3] "dob_year" "dob_month"
## [5] "gender" "tenure"
## [7] "friend_count" "friendships_initiated"
## [9] "likes" "likes_received"
## [11] "mobile_likes" "mobile_likes_received"
## [13] "www_likes" "www_likes_received"
ggpairs(pf_subset)
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
Notes: Micro-array/Gene Expression Data
nci = read.table("nci.tsv")
# Change the colnames to produce a nicer plot
colnames(nci) = c(1:64)
Notes:
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection * Simple extensions to scatter plots and plots of conditional summaries * Scatter plot matrices and heat maps * Reshape data (long <—–> wide)
data("diamonds")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(aes(x = price), data = diamonds) +
geom_histogram(aes(fill = cut)) +
facet_wrap(~ color) +
scale_fill_brewer(palette = "Spectral") +
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(aes(x = table, y = price, color = cut), data = diamonds) +
geom_point() +
scale_color_brewer(type = 'qual') +
scale_x_continuous(limits = c(50, 80), breaks = seq(50, 80, 2))
## Warning: Removed 5 rows containing missing values (geom_point).
diamonds$volume = diamonds$x * diamonds$y * diamonds$z
ggplot(data = subset(diamonds, volume != 0 & diamonds$volume <= quantile(diamonds$volume, 0.99)),
aes(x = volume, y = price)) +
geom_point(aes(color = clarity)) +
scale_color_brewer(type = 'div') +
scale_y_log10()
pf <- transform(pf,prop_initiated = ifelse(friend_count == 0,
0, friendships_initiated/friend_count))
ggplot(aes(x = tenure, y = prop_initiated),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary", fun.y = median)
ggplot(aes(x = tenure, y = prop_initiated),
data = subset(pf, tenure > 0)) +
geom_smooth(aes(color = year_joined.bucket))
## `geom_smooth()` using method = 'gam'
lg_prop_init <- subset(pf, year_joined.bucket == '(2012,2014]')
summary(lg_prop_init$prop_initiated)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.5000 0.6912 0.6430 0.8438 1.0000
ggplot(data = diamonds, aes(x = cut, y = price/carat, color = color)) +
geom_jitter(alpha = 0.3) +
facet_wrap(~ clarity, ncol = 2) +
scale_color_brewer(type = 'div')